library(Biostrings)
library(gplots)
## Warning: package 'gplots' was built under R version 3.2.4
library(RColorBrewer)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
library(reshape2)
Tn019.fasta <- readDNAStringSet('Tn019_S2_L001_redundans/cdhit1000.fa')
tet <- oligonucleotideFrequency(Tn019.fasta, 2, as.prob = T)
rownames(tet) <- names(Tn019.fasta)
cl <- hclust(dist(tet), method='ward.D2')
plot(as.dendrogram(cl), leaflab='none')
#hmcol <- rev(colorRampPalette(brewer.pal(9, "YlGnBu"))(255))
#lab.col <- colorRampPalette(brewer.pal(9, 'RdPu'))(255)
#scaled.sizes <- trunc(width(Tn019.fasta)/max(width(Tn019.fasta))*255)
#heatmap.2(tet, dendrogram = 'row', Rowv = as.dendrogram(cl), trace = 'none', labRow = '', scale='row', col=hmcol, lhei = c(0.5, 4), RowSideColors = lab.col[scaled.sizes])
PCA plot - how well does clustering match up with this?
cl.split <- cutree(cl, 7)
table(cl.split)
## cl.split
## 1 2 3 4 5 6 7
## 1249 1155 882 841 720 752 72
tet.pca <- prcomp(tet, center = T, scale. = T)
plot(tet.pca, type='l')
scores <- data.frame(rownames(tet), tet.pca$x[,1:3], length=width(Tn019.fasta), cluster=cl.split)
qplot(x=PC1, y=PC2, data=scores, col=as.factor(cluster))
qplot(x=PC1, y=PC3, data=scores, col=as.factor(cluster))
qplot(x=PC2, y=PC3, data=scores, col=as.factor(cluster))
Map dinucleotide frequency data onto contig coverage and SNP rate
fullSummary <- read.delim('Varscan.CovSNPrate.summary.tab', header=T)
clusterData <- data.frame(Chrom = gsub('\\|size[0-9]+', '', names(Tn019.fasta)),
cluster = cl.split)
allData <- merge(subset(fullSummary, Sample=='Tn019'), clusterData, by = 'Chrom', all.x=T)
allData[is.na(allData$cluster),'cluster'] <- 0
#Coverage distributions
ggplot(allData, aes(x=medCov)) + geom_histogram(binwidth=5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 150)) + geom_freqpoly(aes(x=medCov, col=as.factor(cluster)), binwidth=5, size=2)
## Warning: Removed 173 rows containing non-finite values (stat_bin).
## Warning: Removed 173 rows containing non-finite values (stat_bin).
## Warning: Removed 14 rows containing missing values (geom_path).
#SNP rate distrubutions
ggplot(allData, aes(x=SNPrate*100)) + geom_histogram(binwidth=0.5, alpha=0.2, col='black') + geom_freqpoly(aes(x=SNPrate*100, col=as.factor(cluster)), binwidth=0.5, size=2) + scale_x_continuous(limits = c(0,10))
## Warning: Removed 53 rows containing non-finite values (stat_bin).
## Warning: Removed 53 rows containing non-finite values (stat_bin).
## Warning: Removed 14 rows containing missing values (geom_path).
# Load SNP-only dataset
snpSummary <- read.delim('Tn019_S2_L001_v_cdhit.VarScanSNPs.tab', header=T)
snpSummary <- cbind(colsplit(snpSummary$Chrom, '\\|size', c("Chrom", "ContigLength")), snpSummary[2:19])
snpSummary$VarFreq <- as.numeric(gsub('%','',snpSummary$VarFreq))
snpSummary['MinorAlleleFreq'] <- ifelse(snpSummary$VarFreq <= 50, snpSummary$VarFreq, 100-snpSummary$VarFreq)
# Merge with cluster data (keep only SNPs that are observed on both strands)
allSNPs <- merge(subset(snpSummary, Strands1 == 2 & Strands2 == 2), clusterData, by = 'Chrom', all.x=T)
ggplot(allSNPs, aes(x=MinorAlleleFreq)) + geom_histogram(binwidth=1, alpha=0.2, col='black') + geom_freqpoly(aes(x=MinorAlleleFreq, col=as.factor(cluster)), binwidth=1, size=2)
tet <- oligonucleotideFrequency(Tn019.fasta, 4, as.prob = T)
rownames(tet) <- names(Tn019.fasta)
cl <- hclust(dist(tet), method='ward.D2')
plot(as.dendrogram(cl), leaflab='none')
cl.split <- cutree(cl, 5)
table(cl.split)
## cl.split
## 1 2 3 4 5
## 1385 1225 1279 1102 680
tet.pca <- prcomp(tet, center = T, scale. = T)
plot(tet.pca, type='l')
scores <- data.frame(rownames(tet), tet.pca$x[,1:3], length=width(Tn019.fasta), cluster=cl.split)
qplot(x=PC1, y=PC2, data=scores, col=as.factor(cluster))
qplot(x=PC1, y=PC3, data=scores, col=as.factor(cluster))
qplot(x=PC2, y=PC3, data=scores, col=as.factor(cluster))
clusterData <- data.frame(Chrom = gsub('\\|size[0-9]+', '', names(Tn019.fasta)),
cluster = cl.split)
allData <- merge(subset(fullSummary, Sample=='Tn019'), clusterData, by = 'Chrom', all.x=T)
allData[is.na(allData$cluster),'cluster'] <- 0
#Coverage distributions
ggplot(allData, aes(x=medCov)) + geom_histogram(binwidth=5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 150)) + geom_freqpoly(aes(x=medCov, col=as.factor(cluster)), binwidth=5, size=2)
## Warning: Removed 173 rows containing non-finite values (stat_bin).
## Warning: Removed 173 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing missing values (geom_path).
#SNP rate distrubutions
ggplot(allData, aes(x=SNPrate*100)) + geom_histogram(binwidth=0.5, alpha=0.2, col='black') + geom_freqpoly(aes(x=SNPrate*100, col=as.factor(cluster)), binwidth=0.5, size=2) + scale_x_continuous(limits = c(0,10))
## Warning: Removed 53 rows containing non-finite values (stat_bin).
## Warning: Removed 53 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing missing values (geom_path).
# Merge with cluster data (keep only SNPs that are observed on both strands)
allSNPs <- merge(subset(snpSummary, Strands1 == 2 & Strands2 == 2), clusterData, by = 'Chrom', all.x=T)
ggplot(allSNPs, aes(x=MinorAlleleFreq)) + geom_histogram(binwidth=1, alpha=0.2, col='black') + geom_freqpoly(aes(x=MinorAlleleFreq, col=as.factor(cluster)), binwidth=1, size=2)